In [1]:
import pandas as pd
import numpy
import json
from collections import defaultdict
import scipy.stats
import math
import pywikibot
from matplotlib.pylab import style
style.use('fivethirtyeight')

%pylab inline
java_min_int = -2147483648


VERBOSE:pywiki:Starting 1 threads...
Populating the interactive namespace from numpy and matplotlib

In [2]:
allrecs = pd.DataFrame.from_dict(json.load(open('helpers/world_cultures_shortcut.json','r')))

In [6]:
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

retrieved = dict()

def english_label(qid):
    
    if type(qid) is float:
        if math.isnan(qid):
            return None
    #first see if we've done it
    try:
        return retrieved[qid]
    except KeyError:
        try:
            page = pywikibot.ItemPage(wikidata, qid)
            data = page.get()
            lab = data['labels']['en']
            retrieved[qid] = lab
            return lab
        except (KeyError, pywikibot.exceptions.NoPage):
            retrieved[qid] = qid
            return qid
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

In [7]:
allrecs['citname'] = allrecs['citizenship'].apply(english_label)
allrecs['countryname'] = allrecs['country'].apply(english_label)


VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.

In [8]:
wikidatanames = set(allrecs['citname']).union(set(allrecs['countryname']))

In [9]:
def normname(name):
    name = name.replace('*','')
    try:
        return {'Iran, Islamic Rep.': 'Iran',
                        'Korea, Rep.':'South Korea',
                        'Brunei Darussalam': 'Brunei',
                        'United States':'United States of America',
                        'Slovak Republic':'Slovakia',
                        'China':"People's Republic of China",
                        'People’s Republic of China':"People's Republic of China",
                         'Kyrgyz Republic': 'Kyrgyzstan',
                         'Russian Federation': 'Russia',
                         'Macedonia, FYR': 'Republic of Macedonia',
                         'Lao PDR':'Laos',
                         'Bahamas':'The Bahamas',
                         u'C\xf4te d\u2019Ivoire':u"C\xf4te d'Ivoire",
                         'Côte d’Ivoire':u"C\xf4te d'Ivoire",
                         'Plu. St.. of Bolivia':'Bolivia',
                         'Viet Nam':'Vietnam',
                         'Myanmar':'Burma',
                         'Former Yugoslav Republic of Macedonia':'Macedonia',
                         'Lao People’s Democratic Republic':'Laos',
                         'Bolivarian Republic of Venezuela':'Venezuela',
                        'Republic of Moldova':'Moldova',
                         'Central African Rep.':'Central African Republic',
                        'Syrian Arab Republic':'Syria',
                        'Republic of Tanzania':'Tanzania',
                         'Palestine, State of':'Palestine',
                         'Moldova (Republic of)':'Moldova',
                        'Sao Tome and Principe': u'Sao Tom\xe9 and Pr\xedncipe',
                        "Lao People's Democratic Republic":'Laos',
                        'Venezuela (Bolivarian Republic of)':'Venezuela',
                        'The former Yugoslav Republic of Macedonia':'Macedonia',
                        'Iran (Islamic Republic of)':'Iran',
                        'Congo (Democratic Republic of the)': u'Democratic Republic of the Congo',
                        'Congo':u'Republic of the Congo',
                        'Tanzania (United Republic of)':'Tanzania',
                        'Hong Kong, China (SAR)':"People's Republic of China",
                        'Russian Federation':'Russia',
                        'Korea (Republic of)':'South Korea',
                        'Bolivia (Plurinational State of)':'Bolivia'}[name]
    except KeyError:
        return name

In [10]:
wef = pd.io.html.read_html('http://reports.weforum.org/global-gender-gap-report-2014/rankings/')[0]
wef['Economy'] = wef['Economy'].apply(normname)
wefnames = set(wef['Economy'])

In [11]:
geidirty = pd.io.html.read_html('http://www.socialwatch.org/node/14367')[2]
gei = geidirty.iloc[3:,6:8]
gei.columns = ['Economy', 'Score']
gei = gei.dropna()
gei["Rank"] = gei['Score'].rank(ascending=False).apply(lambda x: int(x))

In [12]:
def country_sigi_extract(text_line):
    '''put the first strings together as name and the first float as the sigi value'''
    economy = ''
    sigi_val = float()
    for w in text_line.split(' '):
        try:
            sigi_val  = float(w)
            break
        except ValueError:
            if economy:
                economy += ' '  + w
            else:
                economy = w
    return economy, sigi_val

ec_sigi = dict([country_sigi_extract(text_line) for text_line in  sigipdftext.split('\n')] )

sigi = pd.DataFrame.from_dict(ec_sigi, orient='index')
sigi['Economy'] = sigi.index
sigi['Economy'] = sigi['Economy'].apply(normname)
sigi['Score'] = 1-sigi[0]
sigi["Rank"] = sigi['Score'].rank(ascending=False).apply(lambda x: int(x))

In [13]:
gdidirty = pd.DataFrame.from_csv('helpers/foreign_indexes/Table_5__Gender-related_development_index.csv')
nar = gdidirty.iloc[1:,:3]
nar.columns = ['Economy', 'Score', 'Rank']

In [14]:
gdi = nar[(nar['Score'] != '..') & (nar['Rank'] != '—') ]

In [15]:
gdi['Score']  = gdi['Score'].apply(lambda x: float(x))
gdi['Rank']  = gdi['Rank'].apply(lambda x: int(x))
gdi.sort('Score')


WARNING: -c:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING:py.warnings:-c:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING: -c:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING:py.warnings:-c:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Out[15]:
Economy Score Rank
HDI ranks
169 Afghanistan 0.602 148
187 Niger 0.714 147
154 Yemen 0.738 146
146 Pakistan 0.750 145
184 Chad 0.762 144
176 Mali 0.771 143
185 Central African Republic 0.776 142
179 Guinea 0.785 141
175 Liberia 0.786 140
183 Sierra Leone 0.799 139
161 Mauritania 0.801 138
120 Iraq 0.802 137
166 Togo 0.803 136
165 Benin 0.822 134
186 Congo (Democratic Republic of the) 0.822 134
129 Morocco 0.828 132
135 India 0.828 132
152 Nigeria 0.839 131
77 Jordan 0.842 130
93 Algeria 0.843 129
75 Iran (Islamic Republic of) 0.847 128
118 Syrian Arab Republic 0.851 127
173 Ethiopia 0.853 126
110 Egypt 0.855 125
163 Senegal 0.864 124
152 Cameroon 0.872 123
128 Timor-Leste 0.875 122
148 Swaziland 0.877 121
178 Mozambique 0.879 120
138 Ghana 0.884 118
... ... ... ...
117 Philippines 0.989 17
10 Denmark 0.989 17
20 France 0.989 17
96 Jamaica 0.989 17
89 Thailand 0.990 14
114 Moldova (Republic of) 0.990 14
14 United Kingdom 0.993 13
87 Armenia 0.994 8
64 Trinidad and Tobago 0.994 8
58 Bulgaria 0.994 8
5 United States 0.995 7
1 Norway 0.997 5
43 Hungary 0.998 4
67 Venezuela (Bolivarian Republic of) 0.999 2
37 Slovakia 1.000 1
49 Argentina 1.001 2
12 Sweden 1.004 6
24 Finland 1.006 8
25 Slovenia 1.006 8
35 Poland 1.010 14
83 Ukraine 1.012 21
50 Uruguay 1.015 25
70 Kazakhstan 1.015 25
53 Belarus 1.021 32
59 Barbados 1.021 32
103 Mongolia 1.021 32
48 Latvia 1.033 52
35 Lithuania 1.036 58
57 Russian Federation 1.038 61
33 Estonia 1.042 70

148 rows × 3 columns


In [19]:
geinames = set(gei['Economy'].apply(normname))
print geinames
unknown = gdinames.difference(wikidatanames)
for uk in unknown:
    print uk


set([u'Canada', u'Turkmenistan', u'Lithuania', u'Cambodia', u'Ethiopia', u'Sri Lanka', u'Swaziland', u'Argentina', u'Bolivia', u'Cameroon', u'Burkina Faso', u'Ghana', u'Saudi Arabia', u'Japan', u'Cape Verde', u'Slovenia', u'Guatemala', u'Bosnia and Herzegovina', u'Jordan', u'Congo, Rep.', u'Spain', u'Liberia', u'Netherlands', u'Pakistan', u'Oman', u'Tanzania', "People's Republic of China", u'Gabon', u'New Zealand', u'Yemen', u'Jamaica', u'Albania', u'United Arab Emirates', u'India', u'Azerbaijan', u'Lesotho', u'Kenya', 'South Korea', u'Tajikistan', u'Turkey', u'Afghanistan', u'Czech Republic', u'Eritrea', u'Mongolia', u'France', u'Rwanda', u'Slovakia', u'Congo, DR', u'Peru', u'Malawi', u'Benin', u'Singapore', u'United States of America', u'Togo', u'Armenia', u'Dominican Republic', u'Ukraine', u'Bahrain', u'Indonesia', u'Finland', u'Mauritius', u'Sweden', u'Belarus', u'Mali', 'Russia', u'Bulgaria', u'Romania', u'Angola', u'Portugal', u'South Africa', u'Nicaragua', u'Qatar', u'Malaysia', u'Austria', 'Vietnam', u'Mozambique', u'Uganda', u'Hungary', u'Niger', u'Brazil', u'Kuwait', u'Panama', u'Costa Rica', u'Luxembourg', u'Ireland', u'Ecuador', u'Bangladesh', 'Brunei', u'Australia', u'Iran', u'Algeria', u'El Salvador', u'Chile', u'Belgium', u'Thailand', u'Haiti', u'Belize', u'Sierra Leone', u'Georgia', u'Denmark', u'Poland', u'Moldova', u'Morocco', u'Namibia', u'Guinea-Bissau', u'Switzerland', u'Chad', u'Estonia', u'Uruguay', u'Equatorial Guinea', u'Lebanon', u'Uzbekistan', u'Djibouti', u'Colombia', u'Burundi', u'Cyprus', u'Madagascar', u'Italy', u'Bhutan', u'Sudan', u'Nepal', u'Malta', u'Maldives', u'Venezuela', u'Israel', u'Iceland', u'Zambia', u'Senegal', u'Papua New Guinea', u'Zimbabwe', u'Germany', u'Gambia', u'Kazakhstan', u'Philippines', u'Mauritania', u'Kyrgyzstan', u'Trinidad and Tobago', u'Latvia', u'Guyana', u'Syria', u"C\xf4te d'Ivoire", u'Honduras', u'Mexico', u'Egypt', u'Lao, PDR', u'Cuba', u'Serbia', u'Comoros', u'United Kingdom', u'Greece', u'Paraguay', u'Croatia', u'Botswana'])

In [18]:
gdinames = set(gdi['Economy'].apply(normname))
unknown = gdinames.difference(wikidatanames)
for uk in unknown:
    print uk

In [20]:
siginames = set(sigi['Economy']) 
unknown = siginames.difference(wikidatanames)
for uk in unknown:
    print uk

In [21]:
def calibrate_rank_corr(foreign_index, short=False):
    '''takes a foreign index and finds the window for which wigi most correlates with it'''
    corr_df = pd.DataFrame(columns=['start_year', 'bios_count', 'spearman', 'spearman_p', 'mannwhitneyu', 'mannwhitneyu_p', 'ranksum', 'ranksum_p'])

    if not short:
        some_modern_history = range(1000, 1800, 100) + range(1800, 1980, 10)
    else:
        some_modern_history = [1900]
    for start_year in some_modern_history:
        modrecs = allrecs[(allrecs['dob'] >= start_year) &(allrecs['dob'] < 1990)]
        cdf = modrecs[['country','citizenship','gender']]

        def combine_economy(row):
            cit = row['citizenship']
            cunt = row['country']
            return cit if cit else cunt
        cdf['Economy_qid'] = cdf.apply(combine_economy,axis=1)
        edf = cdf[cdf['Economy_qid'].apply(lambda x: x is not None)]
        bios_count = len(edf)

        edf['Economy'] = edf['Economy_qid'].apply(english_label)


        country_perc = defaultdict(dict)
        country_groups= edf.groupby(by='Economy')

        for country, group in country_groups:
            nonmale = group[group['gender'] != 'Q6581097']['gender'].count()
            total = group['gender'].count()
            nm_perc = nonmale / float(total)
            country_perc[country]['Economy'] = country #for later on joining
            country_perc[country]['Score'] = nm_perc #for later on joining
            country_perc[country]['total']= total

        wdf = pd.DataFrame.from_dict(country_perc, orient='index')

        wefnames = set(foreign_index['Economy'])

        wdf_matching = wdf[wdf['Economy'].apply(lambda x: x in wefnames)]
        wdf_matching['Rank'] = wdf_matching['Score'].rank(ascending=False).apply(lambda x: int(x))

        rank_compare = foreign_index.join(wdf_matching, on='Economy', how='left', rsuffix='-Wikidata')[['Economy','Rank','Rank-Wikidata','Score','Score-Wikidata']]
        rank_compare['diff'] = rank_compare['Rank'] - rank_compare['Rank-Wikidata']

        spearman_results = scipy.stats.spearmanr(rank_compare[['Rank','Rank-Wikidata']])
        spearman = spearman_results[0]
        spearman_p = spearman_results[1]

        def scale_col(col):
            num = col - min(col)
            denom = max(col) - min(col)
            return num / denom

        rank_compare['Score_norm'] = scale_col(rank_compare['Score'])
        rank_compare['Score_wikidata_norm'] = scale_col(rank_compare['Score-Wikidata'])

        mannwhitneyu, mannwhitneyu_p = scipy.stats.mannwhitneyu(rank_compare['Score_norm'],rank_compare['Score_wikidata_norm'])
        ranksum, ranksum_p = scipy.stats.ranksums(rank_compare['Score_norm'],rank_compare['Score_wikidata_norm'])

        corr_df = corr_df.append(dict(start_year=start_year,
                                                                bios_count=bios_count,
                                                                spearman=spearman,
                                                                spearman_p = spearman_p,
                                                                mannwhitneyu = mannwhitneyu,
                                                                mannwhitneyu_p = mannwhitneyu_p,
                                                                ranksum = ranksum,
                                                                ranksum_p = ranksum_p), ignore_index=True)
        
    return corr_df #todo just return the max spearman

In [22]:
sigi_corr_df = calibrate_rank_corr(sigi, short=False)


WARNING: -c:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING:py.warnings:-c:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING: -c:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING:py.warnings:-c:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING: -c:40: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING:py.warnings:-c:40: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-22-662364be4d25> in <module>()
----> 1 sigi_corr_df = calibrate_rank_corr(sigi, short=False)

<ipython-input-21-4fa05c099ae1> in calibrate_rank_corr(foreign_index, short)
     15             cunt = row['country']
     16             return cit if cit else cunt
---> 17         cdf['Economy_qid'] = cdf.apply(combine_economy,axis=1)
     18         edf = cdf[cdf['Economy_qid'].apply(lambda x: x is not None)]
     19         bios_count = len(edf)

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   3594                     if reduce is None:
   3595                         reduce = True
-> 3596                     return self._apply_standard(f, axis, reduce=reduce)
   3597             else:
   3598                 return self._apply_broadcast(f, axis)

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _apply_standard(self, func, axis, ignore_failures, reduce)
   3646                 labels = self._get_agg_axis(axis)
   3647                 result = lib.reduce(values, func, axis=axis, dummy=dummy,
-> 3648                                     labels=labels)
   3649                 return Series(result, index=labels)
   3650             except Exception:

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.reduce (pandas/lib.c:40234)()

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.Reducer.get_result (pandas/lib.c:30025)()

<ipython-input-21-4fa05c099ae1> in combine_economy(row)
     12 
     13         def combine_economy(row):
---> 14             cit = row['citizenship']
     15             cunt = row['country']
     16             return cit if cit else cunt

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in __getitem__(self, key)
    509             result = self.index.get_value(self, key)
    510 
--> 511             if not np.isscalar(result):
    512                 if is_list_like(result) and not isinstance(result, Series):
    513 

/usr/local/lib/python2.7/dist-packages/numpy/core/numeric.pyc in isscalar(num)
   1932 
   1933     """
-> 1934     if isinstance(num, generic):
   1935         return True
   1936     else:

KeyboardInterrupt: 

In [27]:
wef_corr_df = calibrate_rank_corr(wef, short=False)

In [ ]:
gdi_corr_df = calibrate_rank_corr(gdi, short=False)

In [ ]:
gei_corr_df = calibrate_rank_corr(gei, short=False)

In [ ]:
gei_corr_df

In [ ]:
sigi_corr_df

In [ ]:
gdi_corr_df

In [29]:
wef_corr_df


Out[29]:
start_year bios_count spearman spearman_p mannwhitneyu mannwhitneyu_p ranksum ranksum_p
0 1000 887006 0.263995 0.001501 6195.0 9.762673e-09 5.616870 1.944474e-08
1 1100 886514 0.265131 0.001429 6193.0 9.600785e-09 5.619760 1.912226e-08
2 1200 885697 0.265446 0.001410 6187.0 9.130603e-09 5.628431 1.818567e-08
3 1300 884571 0.265739 0.001392 6200.0 1.017908e-08 5.609645 2.027420e-08
4 1400 883044 0.263195 0.001553 6197.0 9.927211e-09 5.613980 1.977249e-08
5 1500 879276 0.262088 0.001628 6206.0 1.070155e-08 5.600975 2.131498e-08
6 1600 870495 0.265848 0.001385 6230.0 1.306420e-08 5.566294 2.602143e-08
7 1700 857099 0.265899 0.001382 6252.0 1.566935e-08 5.534503 3.121122e-08
8 1800 815661 0.270712 0.001120 6421.0 6.130025e-08 5.290291 1.221218e-07
9 1810 805811 0.276275 0.000874 6483.0 9.965155e-08 5.200699 1.985407e-07
10 1820 794371 0.277217 0.000838 6564.0 1.858238e-07 5.083650 3.702490e-07
11 1830 781921 0.276718 0.000857 6621.0 2.858085e-07 5.001283 5.694998e-07
12 1840 768581 0.282408 0.000661 6689.0 4.735930e-07 4.903020 9.437414e-07
13 1850 753693 0.290256 0.000458 6753.0 7.553225e-07 4.810538 1.505246e-06
14 1860 736868 0.295216 0.000362 6834.0 1.347622e-06 4.693490 2.685835e-06
15 1870 716540 0.292414 0.000414 6940.0 2.818059e-06 4.540315 5.617012e-06
16 1880 691692 0.298912 0.000302 7110.0 8.776426e-06 4.294659 1.749623e-05
17 1890 660609 0.302099 0.000258 7285.0 2.660525e-05 4.041777 5.304774e-05
18 1900 623915 0.305469 0.000218 7331.0 3.525074e-05 3.975305 7.028916e-05
19 1910 579592 0.312051 0.000157 7367.0 4.380497e-05 3.923283 8.735029e-05
20 1920 534223 0.292366 0.000415 7516.5 1.050574e-04 3.707250 2.095221e-04
21 1930 472919 0.292359 0.000415 7953.0 1.049781e-03 3.076490 2.094533e-03
22 1940 410994 0.273103 0.001008 7938.0 9.759586e-04 3.098166 1.947226e-03
23 1950 331071 0.257789 0.001953 7283.0 2.627829e-05 4.044667 5.239760e-05
24 1960 248180 0.183820 0.028538 7737.0 3.521279e-04 3.388619 7.024563e-04
25 1970 165324 0.138381 0.100519 6806.0 1.104553e-06 4.733951 2.201911e-06

In [ ]:
for df in [gdi_corr_df, sigi_corr_df, wef_corr_df]:
    df.plot(x='start_year',y=['spearman', 'spearman_p'])
    plt.show()

In [87]:
wef_corr_df.to_pickle('opensym/wefdf')

In [2]:
wef_corr_df = pd.read_pickle('opensym/wefdf')

In [9]:
fig, ax = plt.subplots(1, 1, figsize=(6,4))
wef_corr_df.plot(x='start_year',y=['spearman', 'spearman_p'],ax=ax)
ax.set_ylabel('Correlation coefficient')
ax.set_xlabel('Start Year')
ax.legend((r'Spearman $\rho$','Significance $p$'),loc=3)
fig.suptitle('WIGI-GGGI Rank Correlation by Start Year', size=24)
fig.subplots_adjust(top=0.88)
fig.savefig('opensym/spearman_evolution_gggi.png')



In [ ]:
corr_df.plot(x='start_year',y=['mannwhitneyu', 'mannwhitneyu_p'], secondary_y='mannwhitneyu_p')

In [ ]:
corr_df.plot(x='start_year',y=['ranksum', 'ranksum_p'], secondary_y='ranksum_p')

In [ ]:
wdf_matching.sort('Score',ascending=False).head()

In [ ]:
modrecs = allrecs[(allrecs['dob'] >=1890) &(allrecs['dob'] < 1990)]
cdf = modrecs[['country','citizenship','gender']]

def combine_economy(row):
    cit = row['citizenship']
    cunt = row['country']
    return cit if cit else cunt
cdf['Economy_qid'] = cdf.apply(combine_economy,axis=1)
edf = cdf[cdf['Economy_qid'].apply(lambda x: x is not None)]
bios_count = len(edf)

edf['Economy'] = edf['Economy_qid'].apply(english_label)


country_perc = defaultdict(dict)
country_groups= edf.groupby(by='Economy')

for country, group in country_groups:
    nonmale = group[group['gender'] != 'Q6581097']['gender'].count()
    total = group['gender'].count()
    nm_perc = nonmale / float(total)
    country_perc[country]['Economy'] = country #for later on joining
    country_perc[country]['Score'] = nm_perc #for later on joining
    country_perc[country]['total']= total

wdf = pd.DataFrame.from_dict(country_perc, orient='index')

wdf_matching = wdf[wdf['Economy'].apply(lambda x: x in wefnames)]
wdf_matching['Rank'] = wdf_matching['Score'].rank(ascending=False).apply(lambda x: int(x))

rank_compare = wef.join(wdf_matching, on='Economy', how='left', rsuffix='_wikidata')[['Economy','Rank','Rank_wikidata','Score','Score_wikidata']]
rank_compare['diff'] = rank_compare['Rank'] - rank_compare['Rank_wikidata']

In [ ]:
pd.DataFrame.to_html(formatters=)

In [ ]:
print rank_compare.columns

In [ ]:
rank_compare.columns = ['Country', 'WEF Rank', 'Wikipedia Rank','WEF  Score','Wikipedia Score','Rank Difference']

In [ ]:
rank_compare.sort('WEF Rank').head(10).to_html(index=False,formatters={'Wikipedia Score':lambda x: '{:0.4f}'.format(float(x))})

In [ ]:
rank_compare.sort('Wikipedia Rank').to_csv('helpers/foreign_indexes/WIGI_comparison.csv',encoding = 'utf-8', index=False, formatters={'Wikipedia Score':lambda x: '{:0.4f}'.format(float(x))})

In [ ]:
wdfc = wdf[wdf['total'] > 30]

In [ ]:
wdfc['Rank'] = wdfc['Score'].rank(ascending=False).apply(lambda x: int(x))

UNDP's Gender-related Development Index (GDI) and the Gender Empowerment Measure (GEM), introduced only in 1995. More recently, three new measures were developed: the Gender Equity Index (GEI) introduced by Social Watch in 2005, the Global Gender Gap Index (GGGI) developed by the World Economic Forum in 2006, and the Social Institutions and Gender Index (SIGI) of the OECD Development Centre from 2007.


In [ ]:
fiveway = wdfc[['Economy','Score','Rank']]
fiveway.index = fiveway['Economy']
for findex, ftext in zip([sigi,gdi,gei,wef], ['SIGI', 'GDI', 'GEI', 'GGGI']):
    findex.index = findex['Economy']
    fiveway = fiveway.join(findex[['Score','Rank']], how='outer', on = "Economy", rsuffix=" {}".format(ftext))

In [ ]:
fiveway.columns = fiveway.columns[:1] + ['Score WIGI','Rank WIGI'] + fiveway.columns[2:]

In [ ]:
fiveway.sort('Rank').to_csv('helpers/foreign_indexes/WIGI_comparison.csv',encoding = 'utf-8', index=False, formatters={'Wikipedia Score':lambda x: '{:0.4f}'.format(float(x))})

In [ ]:

Quite uncorrellated. That means that the data is not good, or that the world economic forum methods have little to do with the percentage of women born in those countries recorded semantically on a historic level. And /rho is high


In [ ]:
#not clean data bad pdf copy-paste BUT I think that the first string and then SIGI number have copied over 
sigipdftext = '''Belgium 0.0016 0.0038 very low 0.0316 very low 0.0824 low 0.0000 very low 0.0000 very low
France 0.0034 0.1002 low 0.0000 very low 0.0828 low 0.0000 very low 0.0000 very low
Slovenia 0.0037 0.0031 very low 0.0891 very low 0.1023 low 0.0000 very low 0.0000 very low
Spain 0.0049 0.0856 low 0.0622 very low 0.1144 low 0.0000 very low 0.0000 very low
Serbia 0.0097 0.1094 low 0.1171 very low 0.1504 medium 0.0000 very low 0.0000 very low
Argentina 0.0107 0.0809 low 0.0148 very low 0.0691 very low 0.2048 low 0.0000 very low
Italy 0.0116 0.0025 very low 0.1029 very low 0.0966 low 0.0000 very low 0.1951 low
Cuba 0.0208 0.2420 medium 0.0871 very low 0.0000 very low 0.0000 very low 0.1951 low
Trinidad and Tobago 0.0236 0.2504 medium 0.1306 very low 0.0000 very low 0.0000 very low 0.1951 low
Czech Republic 0.0283 0.0013 very low 0.0956 very low 0.0855 low 0.0000 very low 0.3539 medium
Bosnia and Herzegovina 0.0333 0.2437 medium 0.0672 very low 0.1497 medium 0.2048 low 0.1951 low
Belarus 0.0336 0.0251 very low 0.3544 medium 0.0599 very low 0.0000 very low 0.1951 low
Mongolia 0.0345 0.0226 very low 0.2584 medium 0.1582 medium 0.2048 low 0.1951 low
Dominican Republic 0.0367 0.3691 medium 0.0958 very low 0.0118 very low 0.0000 very low 0.1951 low
Panama 0.0375 0.2344 low 0.0148 very low 0.0855 low 0.0000 very low 0.3539 medium
Bolivarian Republic of Venezuela 0.0389 0.2456 medium 0.0941 very low 0.0071 very low 0.0000 very low 0.3539 medium
Ecuador 0.0422 0.1374 low 0.3737 medium 0.1037 low 0.2048 low 0.0000 very low
Lithuania 0.0424 0.0013 very low 0.2795 medium 0.0931 low 0.0000 very low 0.3539 medium
Bulgaria 0.0449 0.1504 low 0.3926 medium 0.0988 low 0.0000 very low 0.1951 low
Brazil 0.0458 0.2316 low 0.1226 very low 0.0364 very low 0.1837 low 0.3539 medium
Cambodia 0.0477 0.0684 low 0.2601 medium 0.0000 very low 0.2028 low 0.3539 medium
El Salvador 0.0490 0.1066 low 0.2675 medium 0.1049 low 0.3885 medium 0.0000 very low
Costa Rica 0.0506 0.2513 medium 0.1544 low 0.0121 very low 0.4076 medium 0.0000 very low
Latvia 0.0511 0.0044 very low 0.3466 medium 0.1008 low 0.0000 very low 0.3539 medium
Plu. St.. of Bolivia 0.0579 0.3676 medium 0.3207 medium 0.0987 low 0.2048 low 0.0000 very low
Paraguay 0.0580 0.2880 medium 0.0440 very low 0.0291 very low 0.4076 medium 0.1951 low
South Africa 0.0599 0.0213 very low 0.2164 low 0.2196 medium 0.4076 medium 0.1951 low
Republic of Moldova 0.0664 0.3418 medium 0.2189 low 0.0000 very low 0.2048 low 0.3539 medium
Romania 0.0686 0.1134 low 0.1700 low 0.0994 low 0.0000 very low 0.5399 high
Azerbaijan 0.2403 0.1301 low 0.2057 low 0.8587 very high 0.1837 low 0.6093 high
Armenia 0.2428 0.1910 low 0.1853 low 0.9880 very high 0.2048 low 0.3539 medium
Ethiopia 0.2450 0.2820 medium 0.8662 very high 0.0878 low 0.5913 high 0.1951 low
Albania 0.2476 0.1822 low 0.2596 medium 0.8767 very high 0.4076 medium 0.4505 medium
Ukraine 0.0750 0.0414 very low 0.1517 low 0.2430 high 0.0000 very low 0.5399 high 
Peru 0.0826 0.4053 medium 0.2096 low 0.0284 very low 0.4076 medium 0.1951 low 
Colombia 0.0862 0.1748 low 0.1567 low 0.0663 very low 0.0000 very low 0.6093 high United
Republic of Tanzania 0.2504 0.7166 very high 0.5415 high 0.1746 medium 0.5913 high 0.2554 low
Lesotho 0.0876 0.4266 high 0.4112 medium 0.2116 medium 0.2048 low 0.0000 very low 
Côte d’Ivoire 0.2537 0.4955 high 0.5895 high 0.1858 medium 0.5913 high 0.5399 high
Madagascar 0.1002 0.4889 high 0.3079 medium 0.0000 very low 0.2048 low 0.3539 medium
Turkey 0.1032 0.1585 low 0.1913 low 0.4036 high 0.0000 very low 0.5399 high 
Timor-Leste 0.2550 0.3882 medium 0.5421 high 0.2271 medium 0.5913 high 0.6552 high
Iraq 0.2631 0.7035 very high 0.3347 medium 0.3834 high 0.5913 high 0.4601 medium
Morocco 0.1052 0.4610 high 0.3159 medium 0.1574 medium 0.3885 medium 0.1951 low 
India 0.2650 0.6440 very high 0.3772 medium 0.5415 very high 0.5913 high 0.3539 medium
Thailand 0.1056 0.3770 medium 0.2935 medium 0.1533 medium 0.3885 medium 0.3539 medium 
Benin 0.2780 0.2763 medium 0.4432 high 0.3677 high 0.5913 high 0.7953 very high
Honduras 0.1074 0.3891 medium 0.1044 very low 0.1443 medium 0.3885 medium 0.4505 medium 
Cameroon 0.2803 0.5024 high 0.5333 high 0.2066 medium 0.7869 very high 0.4505 medium high 
Burkina Faso 0.2819 0.5419 high 0.7257 very high 0.1910 medium 0.5913 high 0.4505 medium
Lebanon 0.2897 0.6143 very high 0.2488 medium 0.1639 medium 0.5913 high 0.7953 very high
Namibia 0.1173 0.1709 low 0.3522 medium 0.0668 very low 0.5913 high 0.2812 low 
Kazakhstan 0.1196 0.0282 very low 0.2176 low 0.1126 low 0.4076 medium 0.6093 high 
Myanmar 0.2935 0.4963 high 0.4891 high 0.0000 very low 0.5913 high 0.7953 
Ghana 0.2988 0.3946 medium 0.5491 high 0.3136 high 0.8044 very high 0.5399 high
Pakistan 0.3013 0.6908 very high 0.4127 medium 0.6998 very high 0.4076 medium 0.4505 medium
People’s Republic of China 0.1310 0.2885 medium
Guatemala 0.1318 0.3953 medium
Rwanda 0.1339 0.2618 medium 0.1246 very low 0.5578 very high 0.4076 0.3213 medium 0.4082 medium 0.2566 high 0.2048 0.1392 medium 0.5913
Former Yugoslav Republic of Macedonia 0.1345 0.1803 low 0.3911 
Jamaica 0.1350 0.0031 very low 0.2046 low 
Mozambique 0.1375 0.4181 high 0.3793 medium 
Zimbabwe 0.1392 0.5700 very high 0.3435 medium 0.2951
Tajikistan 0.1393 0.3182 medium 0.4138 medium 0.5075 medium 0.5666 medium 0.2812 low low 0.5399 high high 0.2554 low very high 0.4076 medium 0.0271 very low 0.0000 0.0000 very low 0.4076 high very high 
Jordan 0.3119 0.5274 high 0.3150 medium 0.6790 very high 0.5913 high 0.6093 high
Guinea 0.3206 0.5413 high 0.9515 very high 0.2253 medium 0.3885 medium 0.4505 medium
Afghanistan 0.3224 0.7316 very high 0.5473 high 0.4644 very high 0.5913 high 0.4601 medium
Nepal 0.3229 0.1813 low 0.4083 medium 1.0000 very high 0.5913 high 0.2554 low
Central African Rep. 0.3285 0.5327 high 0.6135 high 0.0071 very low 0.5913 high 0.7953 very high
Bangladesh 0.3900 0.9730 very high 0.3323 medium 0.5831 very high 0.5913 high 0.4505 medium 0.2028 low 0.3539 medium 
Nigeria 0.3911 0.6723 very high 0.4766 high 0.2494 high 0.7626 very high 0.7953 very high
Mauritania 0.3954 0.7556 very high 0.9939 very high 0.1746 medium 0.5913 high 0.1951 low
Gabon 0.4022 0.6457 very high 0.5308 high 0.1746 medium 0.7869 very high 0.8140 very high
Syrian Arab Republic 0.4162 0.6914 very high 0.2598 medium 0.4312 high 0.5913 high 1.0000 very high
Lao People’s Democratic Republic 0.1445 0.2606 medium 0.5321 high 0.0506 very low 0.4076 medium 0.4505 medium
Haiti 0.1466 0.5613 very high 0.5010 high 0.0000 very low 0.2048 low 0.3539 medium
Uzbekistan 0.1475 0.2477 medium 0.2966 medium 0.1884 medium 0.5913 high 0.4505 medium
Indonesia 0.1532 0.5612 very high 0.2511 medium 0.3891 high 0.1837 low 0.4505 medium
Nicaragua 0.1595 0.6303 very high 0.1868 low 0.1082 low 0.3885 medium 0.4505 medium
Kyrgyzstan 0.1598 0.1879 low 0.3771 medium 0.2624 high 0.5913 high 0.4505 medium
Burundi 0.1662 0.5602 very high 0.5055 high 0.1746 medium 0.4076 medium 0.2554 low
Angola 0.1719 0.4599 high 0.5041 high 0.0791 low 0.5913 high 0.1951 low
Philippines 0.1765 0.4929 high 0.2597 medium 0.1392 medium 0.5913 high 0.4505 medium
Togo 0.1860 0.3696 medium 0.5488 high 0.1326 medium 0.5913 high 0.3539 medium
Viet Nam 0.1865 0.3374 medium 0.1857 low 0.4967 very high 0.4076 medium 0.6093 high
Sri Lanka 0.1894 0.4203 high 0.2681 medium 0.1483 medium 0.6207 high 0.5399 high
Democratic Republic of the Congo 0.4276 0.5169 high 0.5338 high 0.0691 very low 0.9582 very high 0.8140 very high
Egypt 0.4280 0.6665 very high 0.7373 very high 0.3741 high 0.5913 high 0.8140 very high
Niger 0.4415 1.0000 very high 0.4059 medium 0.1746 medium 0.5913 high 0.8140 very high
Zambia 0.4489 0.5149 high 0.5624 high 0.1746 medium 1.0000 very high 0.7953 very high
Somalia 0.4594 0.5958 very high 0.9905 very high 0.0891 low 0.7626 very high 0.6093 high
Chad 0.4665 0.9705 very high 0.8185 very high 0.0014 very low 0.5913 high 0.6093 high
Mali 0.5164 0.8309 very high 1.0000 very high 0.3048 high 0.4076 medium 0.7953 very high very high
Gambia 0.5240 0.5131 high 0.8509 very high 0.0000 very low 1.0000 very high 0.7953 
Sudan 0.5550 0.8382 very high 0.9781 very high 0.1426 medium 0.8163 very high 0.6552 high very high 0.3414 high 0.5913 high 1.0000 very high'''

In [ ]:
modrecs = allrecs[(allrecs['dob'] >= 1900) &(allrecs['dob'] < 1990)]
cdf = modrecs[['country','citizenship','gender']]

def combine_economy(row):
    cit = row['citizenship']
    cunt = row['country']
    return cit if cit else cunt
cdf['Economy_qid'] = cdf.apply(combine_economy,axis=1)
edf = cdf[cdf['Economy_qid'].apply(lambda x: x is not None)]
bios_count = len(edf)

edf['Economy'] = edf['Economy_qid'].apply(english_label)


country_perc = defaultdict(dict)
country_groups= edf.groupby(by='Economy')

for country, group in country_groups:
    nonmale = group[group['gender'] != 'Q6581097']['gender'].count()
    total = group['gender'].count()
    nm_perc = nonmale / float(total)
    country_perc[country]['Economy'] = country #for later on joining
    country_perc[country]['Score'] = nm_perc #for later on joining
    country_perc[country]['total']= total

wdf = pd.DataFrame.from_dict(country_perc, orient='index')

In [ ]:
wdf[wdf['total']>100].tail(100)

In [ ]:
wdf.ix["People's Republic of China"]

In [ ]:
#magnus' special format
nonzero = wdf[(wdf['Score'] != 0.0) & (wdf['total']> 100)]
magnusformt = zip(nonzero['Economy'],nonzero['Score'])
json.dump(magnusformt, open('Magnus Gender analysis/wigi_gender.json','w'))

In [ ]:
!less Magnus\ Gender\ analysis/wigi_gender.json

In [ ]: